Word Embedding: the collective name for a set of language modeling and feature learning techniques in natural language processing (NLP) where words or phrases from the vocabulary are mapped to vectors of real numbers. Conceptually it involves a mathematical embedding from a space with many dimensions per word to a continuous vector space with a much lower dimension. (Source: Wikipedia)
Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora. Target audience is the natural language processing (NLP) and information retrieval (IR) community.
# Load Python libraries
import io
import re
import pandas as pd
import random
import numpy as np
from collections import Counter
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# Load NLP libraries from gensim and spacy
from gensim.models import Word2Vec
import spacy.lang.en as en
# Load Plot libraries
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Util function to read a plain text file
def read_text_file(file_path):
    """Read a plain-text file and return its entire contents as a string.

    Parameters
    ----------
    file_path : str
        Path to the text file. The file is decoded as ISO-8859-1, matching
        the encoding of the corpus files used by this notebook.

    Returns
    -------
    str
        The full file contents.
    """
    # io.open is an alias of the builtin open() on Python 3; kept for
    # consistency with the notebook's imports.
    with io.open(file_path, 'r', encoding='ISO-8859-1') as f:
        return f.read()
# Get the text sample: a plain-text novel read with the helper defined above
file_path = "../data/en/The Adventures of Sherlock Holmes - Arthur Conan Doyle.txt"
plain_text = read_text_file(file_path)
# Notebook display expression: document length in characters
len(plain_text)
# Show first 1000 characters of document
plain_text[:1000]
Data Quality process: refers to cleaning the input data so that it has meaning and value.
# Clean the text: lowercase, turn line breaks into sentence ends, strip
# everything except letters and periods, then collapse runs of whitespace
# and runs of periods.
clean_text = plain_text.lower().replace('\n', '.')
for pattern, repl in (('[^a-zA-Z.]', ' '), (r'\s+', ' '), (r'\.+', '.')):
    clean_text = re.sub(pattern, repl, clean_text)
# Preview the first 1000 cleaned characters
clean_text[:1000]
# Sentence tokenization: the cleaning step made '.' the only sentence boundary
sentence_list = clean_text.split('.')
len(sentence_list)
# Word tokenization: split each sentence once and keep only the non-empty ones
word_list = [tokens for tokens in map(str.split, sentence_list) if tokens]
word_list[:10]
# Count the words in a document and return the most N repeated
def count_words(sentences, n):
    """Count word frequencies across tokenized sentences.

    Parameters
    ----------
    sentences : iterable of list of str
        Tokenized sentences, each a list of word tokens.
    n : int
        Number of top entries to return.

    Returns
    -------
    list of (str, int)
        The n most frequent words with their counts, most frequent first.
    """
    words = Counter()
    for sent in sentences:
        # Counter.update counts every token of the sentence in one pass,
        # replacing the manual per-word increment loop
        words.update(sent)
    return words.most_common(n)
# Get the 50 most common words (stopwords are still included at this point)
n_words = count_words(word_list, 50)
# (word, quantity) tuples -> two-column DataFrame for display and plotting
df = pd.DataFrame.from_records(n_words, columns = ['word', 'quantity'])
df.head(10)
# Plot the most common words in the document
fig = plt.figure(figsize = (18, 6))
sns.barplot(x = 'word', y = 'quantity', data = df)
plt.title('The 50 Most Common Words in document')
plt.show()
- Stopwords: refer to the most common words in a language, which do not significantly affect the meaning of the text.
# Get the spaCy English stopword set (a hash set, so membership tests are O(1))
stopwords_en = en.stop_words.STOP_WORDS
print(stopwords_en)
# Remove stopwords and very short tokens (length <= 2) from every sentence.
# Iterate the sentences directly instead of indexing via range(len(...)).
all_words = [
    [word for word in sentence if word not in stopwords_en and len(word) > 2]
    for sentence in word_list
]
all_words[:10]
# Get the most common words in the document after removing the stopwords
n_words = count_words(all_words, 50)
# (word, quantity) records -> DataFrame, same shape as the pre-stopword plot
df = pd.DataFrame.from_records(n_words, columns = ['word', 'quantity'])
df.head(10)
# Plot the most common words in the document after removing the stopwords
fig = plt.figure(figsize = (18, 6))
sns.barplot(x = 'word', y = 'quantity', data = df)
plt.title('The 50 Most Common Words in document')
plt.show()
# Reconstruct the clean text (without stop-words) for the word cloud.
# str.join builds the string in a single pass instead of the quadratic
# `new_clean_text = new_clean_text + word + ' '` concatenation loop;
# the trailing space of the original output is preserved exactly.
words_flat = [word for sent in all_words for word in sent]
new_clean_text = ' '.join(words_flat) + ' ' if words_flat else ''
# Custom color function
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Word-cloud recoloring callback: return a random light shade of gold.

    The hue (45) and full saturation are fixed; only the lightness varies
    per word. HSL saturation and lightness must lie in 0-100%: the original
    values (saturation 150%, lightness 160-255%) were outside the valid HSL
    range and would be clamped to plain white, so the lightness is drawn
    from 60-90% instead. Extra positional/keyword arguments are part of the
    wordcloud color_func signature and are ignored.
    """
    return "hsl(45, 100%%, %d%%)" % random.randint(60, 90)
# Create a Word cloud from the stopword-free reconstructed text
wc = WordCloud(max_font_size = 60, min_font_size = 5, max_words = 150, background_color = "black", margin = 2)
wc = wc.generate(new_clean_text)
# Plot the Word cloud, recoloring with the custom HSL color function;
# the fixed random_state makes the recoloring reproducible
plt.figure(figsize = (12, 12))
plt.imshow(wc.recolor(color_func = color_func, random_state=3), interpolation = "bilinear")
plt.axis("off")
plt.show()
- Word2Vec consists of models for generating word embeddings. These models are shallow two-layer neural networks having one input layer, one hidden layer and one output layer. Word2Vec utilizes two architectures: CBOW (Continuous Bag of Words) and Skip-Gram. (Source: Wikipedia)
# Algorithm hyper-parameters
min_count = 3   # minimum frequency count of words; rarer words are ignored by the model
size = 150      # dimensionality of the dense vector learned for each word
window = 5      # maximum distance between the target word and a context word
sg = 0          # 0 = CBOW (Continuous Bag of Words), 1 = Skip-Gram
n_epochs = 10   # number of iterations (epochs) over the corpus; renamed from
                # 'iter' so the builtin iter() is not shadowed
# Create the Word2Vec model.
# NOTE(review): these keyword names are the gensim 3.x API; gensim 4+ renamed
# size -> vector_size and iter -> epochs — confirm the installed version.
w2v_model = Word2Vec(all_words, min_count = min_count, size = size, window = window, sg = sg, iter = n_epochs)
- Vocabulary: unique words of the document.
# Show vocabulary size: unique words the model kept
# (those occurring at least min_count = 3 times, per the training params above)
vocabulary = w2v_model.wv.vocab
len(vocabulary)
# Show the learned dense vector (size = 150 dimensions) for the word 'holmes'
w2v_model.wv['holmes']
- Similar Words: Words more similar in terms of meaning and context.
# Words closest to 'holmes' in the embedding space
w2v_model.wv.most_similar(positive = ['holmes'], topn = 10)
# Words farthest from 'holmes' (most similar to its negation)
w2v_model.wv.most_similar(negative = ['holmes'], topn = 10)
# Similarity score between two specific words
w2v_model.wv.similarity(w1 = 'holmes', w2 = 'watson')
# Similarity is symmetric: sim(w1, w2) = sim(w2, w1)
w2v_model.wv.similarity(w1 = 'watson', w2 = 'holmes')
# Show the word that fits least with the others in the list
w2v_model.wv.doesnt_match(['holmes', 'watson', 'mycroft'])
# Projection setup
target_word = 'sherlock'
top_n = 25
# The top_n most and least similar words to the target word
most_similar = w2v_model.wv.most_similar(positive = [target_word], topn = top_n)
less_similar = w2v_model.wv.most_similar(negative = [target_word], topn = top_n)
# Tag each entry so the scatter plot can color it:
# the target itself ('current'), then the 'most' and 'less' similar neighborhoods
neighbors = [(target_word, 1, 'current')]
neighbors += [(*row, 'most') for row in most_similar]
neighbors += [(*row, 'less') for row in less_similar]
# Fetch the embedding vectors of every neighbor word. Indexing through .wv
# (rather than the deprecated direct model indexing w2v_model[...]) matches
# how vectors are accessed everywhere else in this notebook.
neigh_word = [row[0] for row in neighbors]
X = w2v_model.wv[neigh_word]
len(X)
# Project the word vectors onto their first two principal components
pca = PCA(n_components = 2)
pca_data = pca.fit_transform(X)
# Fraction of the total variance captured by each component
list(pca.explained_variance_ratio_)
# One row per word: its 2-D coordinates plus the word itself
pca_df = pd.DataFrame(pca_data, columns = ["PC1", "PC2"]).assign(Name = neigh_word)
pca_df.head(10)
# Create a scatter plot of the 2-D PCA projection of the word vectors
fig, ax = plt.subplots(figsize = (16, 16))
gap = 0.001  # base offset used to nudge each text label away from its marker
# Marker color per neighbor category (target word / most similar / least similar)
colors = dict()
colors['current'] = 'royalblue'
colors['most'] = 'forestgreen'
colors['less'] = 'orange'
# Add points one by one with a loop
for i, word in enumerate(neigh_word):
node_col = colors[neighbors[i][2]]
# The target word gets a bigger marker and an upper-case label;
# every other word is labeled with its rounded similarity score
if word == target_word:
node_size = 100
text = word.upper()
else:
node_size = 50
text = word + ': ' + str(round(neighbors[i][1], 3))
plt.scatter(pca_data[i, 0], pca_data[i, 1], c = node_col, s = node_size)
plt.annotate(text, xy = (pca_data[i, 0] + gap*30, pca_data[i, 1] - gap/3))
# Axis labels, title, legend and grid
ax.set_xlabel("PC 1", fontsize = 12)
ax.set_ylabel("PC 2", fontsize = 12)
ax.set_title("Most Similar Words to " + target_word, fontsize = 20)
ax.legend(["Similar Words"])
ax.grid()
# Create a second Word2Vec model with a smaller vocabulary (min_count = 5)
# and 100-dimensional vectors, so the full similarity matrices below stay tractable.
# Note: this rebinds w2v_model and vocabulary for the rest of the notebook.
w2v_model = Word2Vec(all_words, min_count = 5, size = 100, window = 5, sg = 0)
vocabulary = w2v_model.wv.vocab
len(vocabulary)
# Returns the dense similarity between all the words in the document
def get_dense_similarity(model, precision = 3):
    """Return the full pairwise word-similarity matrix of a model's vocabulary.

    Parameters
    ----------
    model : trained gensim Word2Vec model
        Its vocabulary (model.wv.vocab) defines both the rows and the columns.
    precision : int, optional
        Number of decimals each similarity is rounded to (default 3).

    Returns
    -------
    list of list of float
        A square matrix, rows and columns in vocabulary iteration order.
        Makes O(V^2) similarity calls, so it can be slow for large vocabularies.
    """
    vocabulary = model.wv.vocab
    # Nested comprehension replaces the manual loops; the original dead
    # `word_sim = 0` initialization and trailing semicolon are gone.
    return [
        [round(model.wv.similarity(w1 = w1, w2 = w2), precision) for w2 in vocabulary]
        for w1 in vocabulary
    ]
# Create a DataFrame with the similarity between all the words in the document
words_sim = get_dense_similarity(w2v_model, 2)
df_dense = pd.DataFrame.from_records(words_sim, columns = vocabulary)
print(df_dense.shape)
# Peek at the upper-left 18x18 corner of the matrix
df_dense.iloc[:18, :18]
# Plot the dense similarity matrix as a heatmap
fig, ax = plt.subplots(figsize = (14, 14))
sns.heatmap(words_sim, ax = ax)
ax.set_title("Dense Similarity Matrix", fontsize = 16)
ax.set_xlabel('vocabulary', fontsize = 12)
ax.set_ylabel('vocabulary', fontsize = 12)
plt.show()
# Export the dense word-similarity matrix to CSV
file_path = "../data/network/dense_similarity.csv"
df_dense.to_csv(file_path, index = False, sep = ',')
# Returns the sparse similarity between all the words in the document
def get_sparse_similarity(model, precision = 3, top_n = 10):
    """Return a sparse similarity matrix keeping only each word's top_n neighbors.

    Parameters
    ----------
    model : trained gensim Word2Vec model
        Its vocabulary (model.wv.vocab) defines the rows and columns.
    precision : int, optional
        Decimals each kept similarity is rounded to (default 3).
    top_n : int, optional
        Number of nearest neighbors kept per word; every other cell stays 0.

    Returns
    -------
    (list of np.ndarray, list of str)
        One zero-filled 1-D array per vocabulary word (only the top_n
        neighbor cells set) and the vocabulary in row order.
    """
    vocabulary = list(model.wv.vocab.keys())
    n_words = len(vocabulary)
    # Precomputed word -> column map: O(1) lookups instead of repeated
    # list.index scans (O(V) per neighbor)
    word_ix = {word: ix for ix, word in enumerate(vocabulary)}
    matrix = []
    for word in vocabulary:
        row_sim = np.zeros(n_words)
        # Bug fix: query the 'model' argument, not the global w2v_model,
        # so the function works for any model passed in
        best_sim = model.wv.most_similar(positive = [word], topn = top_n)
        for nei_name, nei_val in best_sim:
            row_sim[word_ix[nei_name]] = round(nei_val, precision)
        matrix.append(row_sim)
    return matrix, vocabulary
# Create a DataFrame with the similarity between each word and its 50 nearest words
words_sim, vocabulary = get_sparse_similarity(w2v_model, 2, 50)
df_sparse = pd.DataFrame.from_records(words_sim, columns = vocabulary)
print(df_sparse.shape)
# Peek at the upper-left 18x18 corner of the sparse matrix
df_sparse.iloc[:18, :18]
# Plot the sparse similarity matrix as a heatmap
fig, ax = plt.subplots(figsize = (14, 14))
sns.heatmap(words_sim, ax = ax)
ax.set_title("Sparse Similarity Matrix", fontsize = 16)
ax.set_xlabel('vocabulary', fontsize = 12)
ax.set_ylabel('vocabulary', fontsize = 12)
plt.show()
# Export the sparse word-similarity matrix to CSV
file_path = "../data/network/sparse_similarity.csv"
df_sparse.to_csv(file_path, index = False, sep = ',')